Loading Packages

# knitr chunk defaults: echo the source of each chunk and suppress messages
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
# Load required packages. The attach order is left untouched because packages
# attached later can mask functions exported by earlier ones.
library(tidyverse)
library(dplyr)
library(ggplot2)
library(plotly)
library(rworldmap)

Basic Data Information

# Read the Netflix titles dataset; text columns stay character, not factor.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
df <- read.csv("netflix_titles.csv", stringsAsFactors = FALSE)

summary(df) # summary statistics
##    show_id              type              title             director        
##  Length:7787        Length:7787        Length:7787        Length:7787       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      cast             country           date_added         release_year 
##  Length:7787        Length:7787        Length:7787        Min.   :1925  
##  Class :character   Class :character   Class :character   1st Qu.:2013  
##  Mode  :character   Mode  :character   Mode  :character   Median :2017  
##                                                           Mean   :2014  
##                                                           3rd Qu.:2018  
##                                                           Max.   :2021  
##     rating            duration          listed_in         description       
##  Length:7787        Length:7787        Length:7787        Length:7787       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
nrow(df) # number of observations
## [1] 7787
length(unique(df$director)) # distinct director values; the column has repeats
## [1] 4050
# per-column count of entries that are NA, 0, or an empty string
vapply(df, function(col) sum(is.na(col) | col == 0 | col == ""), integer(1))
##      show_id         type        title     director         cast      country 
##            0            0            0         2389          718          507 
##   date_added release_year       rating     duration    listed_in  description 
##           10            0            7            0            0            0
vapply(df, function(col) sum(is.na(col)), integer(1)) # no column contains NA
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in  description 
##            0            0            0            0            0            0
vapply(df, function(col) sum(col == 0), integer(1)) # no column contains a 0
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in  description 
##            0            0            0            0            0            0
vapply(df, function(col) sum(col == ""), integer(1)) # every missing value is an empty string
##      show_id         type        title     director         cast      country 
##            0            0            0         2389          718          507 
##   date_added release_year       rating     duration    listed_in  description 
##           10            0            7            0            0            0

Visualization

Movie vs TV show

# Tally how many titles fall under each content type
type <- summarise(group_by(df, type), count = n())
type
## # A tibble: 2 x 2
##   type    count
##   <chr>   <int>
## 1 Movie    5377
## 2 TV Show  2410
# (disabled) center the title of every plot created from here on:
# theme_update(plot.title = element_text(hjust = 0.5))

# Bar chart of the number of titles per content type, labelled with the counts
bar <- ggplot(data = type, mapping = aes(x = type, y = count, fill = type)) +
  geom_bar(stat = "identity") +
  geom_text(mapping = aes(label = count), vjust = -0.3) +
  theme(plot.title = element_text(hjust = 0.5)) + # center the title
  labs(title = "Netflix contents by type", x = "Type", fill = "Type")
bar

# Split the catalogue by content type so each part can be analysed on its own
movie <- df[df$type == "Movie", ]
tv <- df[df$type == "TV Show", ]

The number of movies on Netflix is more than double the number of TV shows.

Netflix Movie Analysis

Top 10 Countries by number of movies on Netflix

  1. Bar plot
# Quick look at the country column of the movie subset
length(unique(movie[["country"]])) # the column contains many repeated values
## [1] 591
head(movie[["country"]], n = 50) # some movies list several countries, separated by commas
##  [1] "Mexico"                         "Singapore"                     
##  [3] "United States"                  "United States"                 
##  [5] "Egypt"                          "United States"                 
##  [7] "India"                          "India"                         
##  [9] "United States"                  "Thailand"                      
## [11] "United States"                  "Nigeria"                       
## [13] "Norway, Iceland, United States" "India"                         
## [15] "United Kingdom"                 "India"                         
## [17] "India"                          "India"                         
## [19] "India"                          "United States"                 
## [21] "South Korea"                    "Italy"                         
## [23] "Canada"                         "Indonesia"                     
## [25] "Indonesia"                      "United States"                 
## [27] "Canada"                         "United States"                 
## [29] "Romania"                        "Romania"                       
## [31] "Spain"                          "Turkey"                        
## [33] "Iceland"                        "Turkey"                        
## [35] "Nigeria"                        "United States"                 
## [37] "United States"                  "United States"                 
## [39] "South Africa, Nigeria"          "France"                        
## [41] "United States, South Africa"    "Spain"                         
## [43] "Portugal, Spain"                "United States"                 
## [45] "United States"                  "Indonesia"                     
## [47] "India"                          "United States"                 
## [49] "United States"                  "United States"
# Break multi-country entries apart so each element is a single country name
country <- unlist(strsplit(movie$country, split = ", ")) # split at ", "
unique(country) # a few names still carry a trailing comma
##   [1] "Mexico"               "Singapore"            "United States"       
##   [4] "Egypt"                "India"                "Thailand"            
##   [7] "Nigeria"              "Norway"               "Iceland"             
##  [10] "United Kingdom"       "South Korea"          "Italy"               
##  [13] "Canada"               "Indonesia"            "Romania"             
##  [16] "Spain"                "Turkey"               "South Africa"        
##  [19] "France"               "Portugal"             "Hong Kong"           
##  [22] "China"                "Germany"              "Argentina"           
##  [25] "Serbia"               "Denmark"              "Poland"              
##  [28] "Japan"                "Kenya"                "New Zealand"         
##  [31] "Pakistan"             "Australia"            "Taiwan"              
##  [34] "Netherlands"          "Philippines"          "United Arab Emirates"
##  [37] "Brazil"               "Iran"                 "Belgium"             
##  [40] "Israel"               "Uruguay"              "Bulgaria"            
##  [43] "Chile"                "Colombia"             "Algeria"             
##  [46] "Soviet Union"         "Sweden"               "Malaysia"            
##  [49] "Ireland"              "Luxembourg"           "Austria"             
##  [52] "Peru"                 "Senegal"              "Switzerland"         
##  [55] "Ghana"                "Saudi Arabia"         "Armenia"             
##  [58] "Jordan"               "Mongolia"             "Namibia"             
##  [61] "Finland"              "Lebanon"              "Qatar"               
##  [64] "Vietnam"              "Russia"               "Malta"               
##  [67] "Kuwait"               "Czech Republic"       "Bahamas"             
##  [70] "Sri Lanka"            "Cayman Islands"       "Bangladesh"          
##  [73] "United States,"       "Zimbabwe"             "Hungary"             
##  [76] "Latvia"               "Liechtenstein"        "Venezuela"           
##  [79] "Morocco"              "Cambodia"             "Albania"             
##  [82] "Nicaragua"            "Greece"               "Cambodia,"           
##  [85] "Croatia"              "Guatemala"            "West Germany"        
##  [88] "Poland,"              "Slovenia"             "Dominican Republic"  
##  [91] "Nepal"                "Samoa"                "Bermuda"             
##  [94] "Ecuador"              "Georgia"              "Botswana"            
##  [97] "Iraq"                 "Vatican City"         "Angola"              
## [100] "Jamaica"              "Kazakhstan"           "Malawi"              
## [103] "Slovakia"             "Lithuania"            "Afghanistan"         
## [106] "Paraguay"             "Somalia"              "Sudan"               
## [109] "Panama"               "United Kingdom,"      "Uganda"              
## [112] "East Germany"         "Ukraine"              "Montenegro"
# Strip the stray commas that are still attached to some country names
country <- gsub(",", "", country, fixed = TRUE)
unique(country)
##   [1] "Mexico"               "Singapore"            "United States"       
##   [4] "Egypt"                "India"                "Thailand"            
##   [7] "Nigeria"              "Norway"               "Iceland"             
##  [10] "United Kingdom"       "South Korea"          "Italy"               
##  [13] "Canada"               "Indonesia"            "Romania"             
##  [16] "Spain"                "Turkey"               "South Africa"        
##  [19] "France"               "Portugal"             "Hong Kong"           
##  [22] "China"                "Germany"              "Argentina"           
##  [25] "Serbia"               "Denmark"              "Poland"              
##  [28] "Japan"                "Kenya"                "New Zealand"         
##  [31] "Pakistan"             "Australia"            "Taiwan"              
##  [34] "Netherlands"          "Philippines"          "United Arab Emirates"
##  [37] "Brazil"               "Iran"                 "Belgium"             
##  [40] "Israel"               "Uruguay"              "Bulgaria"            
##  [43] "Chile"                "Colombia"             "Algeria"             
##  [46] "Soviet Union"         "Sweden"               "Malaysia"            
##  [49] "Ireland"              "Luxembourg"           "Austria"             
##  [52] "Peru"                 "Senegal"              "Switzerland"         
##  [55] "Ghana"                "Saudi Arabia"         "Armenia"             
##  [58] "Jordan"               "Mongolia"             "Namibia"             
##  [61] "Finland"              "Lebanon"              "Qatar"               
##  [64] "Vietnam"              "Russia"               "Malta"               
##  [67] "Kuwait"               "Czech Republic"       "Bahamas"             
##  [70] "Sri Lanka"            "Cayman Islands"       "Bangladesh"          
##  [73] "Zimbabwe"             "Hungary"              "Latvia"              
##  [76] "Liechtenstein"        "Venezuela"            "Morocco"             
##  [79] "Cambodia"             "Albania"              "Nicaragua"           
##  [82] "Greece"               "Croatia"              "Guatemala"           
##  [85] "West Germany"         "Slovenia"             "Dominican Republic"  
##  [88] "Nepal"                "Samoa"                "Bermuda"             
##  [91] "Ecuador"              "Georgia"              "Botswana"            
##  [94] "Iraq"                 "Vatican City"         "Angola"              
##  [97] "Jamaica"              "Kazakhstan"           "Malawi"              
## [100] "Slovakia"             "Lithuania"            "Afghanistan"         
## [103] "Paraguay"             "Somalia"              "Sudan"               
## [106] "Panama"               "Uganda"               "East Germany"        
## [109] "Ukraine"              "Montenegro"
# Wrap the vector of country names in a one-column data frame
movie_country <- data.frame(country = country)
unique(movie_country[["country"]]) # the country names are clean now
##   [1] "Mexico"               "Singapore"            "United States"       
##   [4] "Egypt"                "India"                "Thailand"            
##   [7] "Nigeria"              "Norway"               "Iceland"             
##  [10] "United Kingdom"       "South Korea"          "Italy"               
##  [13] "Canada"               "Indonesia"            "Romania"             
##  [16] "Spain"                "Turkey"               "South Africa"        
##  [19] "France"               "Portugal"             "Hong Kong"           
##  [22] "China"                "Germany"              "Argentina"           
##  [25] "Serbia"               "Denmark"              "Poland"              
##  [28] "Japan"                "Kenya"                "New Zealand"         
##  [31] "Pakistan"             "Australia"            "Taiwan"              
##  [34] "Netherlands"          "Philippines"          "United Arab Emirates"
##  [37] "Brazil"               "Iran"                 "Belgium"             
##  [40] "Israel"               "Uruguay"              "Bulgaria"            
##  [43] "Chile"                "Colombia"             "Algeria"             
##  [46] "Soviet Union"         "Sweden"               "Malaysia"            
##  [49] "Ireland"              "Luxembourg"           "Austria"             
##  [52] "Peru"                 "Senegal"              "Switzerland"         
##  [55] "Ghana"                "Saudi Arabia"         "Armenia"             
##  [58] "Jordan"               "Mongolia"             "Namibia"             
##  [61] "Finland"              "Lebanon"              "Qatar"               
##  [64] "Vietnam"              "Russia"               "Malta"               
##  [67] "Kuwait"               "Czech Republic"       "Bahamas"             
##  [70] "Sri Lanka"            "Cayman Islands"       "Bangladesh"          
##  [73] "Zimbabwe"             "Hungary"              "Latvia"              
##  [76] "Liechtenstein"        "Venezuela"            "Morocco"             
##  [79] "Cambodia"             "Albania"              "Nicaragua"           
##  [82] "Greece"               "Croatia"              "Guatemala"           
##  [85] "West Germany"         "Slovenia"             "Dominican Republic"  
##  [88] "Nepal"                "Samoa"                "Bermuda"             
##  [91] "Ecuador"              "Georgia"              "Botswana"            
##  [94] "Iraq"                 "Vatican City"         "Angola"              
##  [97] "Jamaica"              "Kazakhstan"           "Malawi"              
## [100] "Slovakia"             "Lithuania"            "Afghanistan"         
## [103] "Paraguay"             "Somalia"              "Sudan"               
## [106] "Panama"               "Uganda"               "East Germany"        
## [109] "Ukraine"              "Montenegro"
# Number of times each country name occurs across the movie entries
movie_country <- summarise(group_by(movie_country, country), count = n())
head(movie_country, n = 10)
## # A tibble: 10 x 2
##    country     count
##    <chr>       <int>
##  1 Afghanistan     1
##  2 Albania         1
##  3 Algeria         2
##  4 Angola          1
##  5 Argentina      64
##  6 Armenia         1
##  7 Australia      84
##  8 Austria        10
##  9 Bahamas         1
## 10 Bangladesh      3
# Bar chart of the ten countries with the highest movie counts
bar <- movie_country %>%
  arrange(desc(count)) %>% # largest count first
  head(10) %>% # keep the first ten rows
  ggplot(aes(x = reorder(country, -count), y = count)) + # bars ordered high to low
  geom_bar(stat = "identity") +
  geom_text(aes(label = count), vjust = -0.3) + # count label above each bar
  labs(x = "Country", title = "Top 10 Countries by number of movies on Netflix") +
  theme_classic()
bar

The United States has the largest number of movies on Netflix.

  2. World Map
# Join the per-country counts to the map data, matching on country name
spdf <- joinCountryData2Map(
  movie_country,
  joinCode = "NAME",
  nameJoinColumn = "country"
)
# Draw the map without its built-in legend and keep the returned parameters
mapParams <- mapCountryData(
  spdf,
  nameColumnToPlot = "count",
  catMethod = c(0, 1, 3, 5, 10, 50, 100, 300, 500, 1000, 2500),
  mapTitle = "Number of Movies added to Netflix",
  addLegend = FALSE
)
# Add a legend from those parameters, with custom label and width settings
do.call(addMapLegend, c(mapParams, legendLabels = "all", legendWidth = 0.5))

#labelCountries()

Continent-wise, the Americas (North and South America) exhibit a high number of Netflix movies overall.

Number of movies by Year

Line graph of the number of movies from each country as the years go by: x = year, y = count, color = country

# Expand the movie subset to one row per title / genre / country combination
genre_country <- movie %>%
  # turn the comma-separated genre string into a list column, then expand it
  mutate(listed_in = strsplit(as.character(listed_in), split = ", ")) %>%
  unnest(listed_in) %>%
  # same treatment for the comma-separated country string
  mutate(country = strsplit(as.character(country), split = ", ")) %>%
  unnest(country) %>%
  select(title, country, date_added, listed_in)
genre_country
## # A tibble: 14,342 x 4
##    title country       date_added listed_in           
##    <chr> <chr>         <chr>      <chr>               
##  1 7:19  Mexico        2016       Dramas              
##  2 7:19  Mexico        2016       International Movies
##  3 23:59 Singapore     2018       Horror Movies       
##  4 23:59 Singapore     2018       International Movies
##  5 9     United States 2017       Action & Adventure  
##  6 9     United States 2017       Independent Movies  
##  7 9     United States 2017       Sci-Fi & Fantasy    
##  8 21    United States 2020       Dramas              
##  9 122   Egypt         2020       Horror Movies       
## 10 122   Egypt         2020       International Movies
## # … with 14,332 more rows
unique(genre_country[["country"]]) # a few country names still end in a comma
##   [1] "Mexico"               "Singapore"            "United States"       
##   [4] "Egypt"                "India"                "Thailand"            
##   [7] "Nigeria"              "Norway"               "Iceland"             
##  [10] "United Kingdom"       "South Korea"          "Italy"               
##  [13] "Canada"               "Indonesia"            "Romania"             
##  [16] "Spain"                "Turkey"               "South Africa"        
##  [19] "France"               "Portugal"             "Hong Kong"           
##  [22] "China"                "Germany"              "Argentina"           
##  [25] "Serbia"               "Denmark"              "Poland"              
##  [28] "Japan"                "Kenya"                "New Zealand"         
##  [31] "Pakistan"             "Australia"            "Taiwan"              
##  [34] "Netherlands"          "Philippines"          "United Arab Emirates"
##  [37] "Brazil"               "Iran"                 "Belgium"             
##  [40] "Israel"               "Uruguay"              "Bulgaria"            
##  [43] "Chile"                "Colombia"             "Algeria"             
##  [46] "Soviet Union"         "Sweden"               "Malaysia"            
##  [49] "Ireland"              "Luxembourg"           "Austria"             
##  [52] "Peru"                 "Senegal"              "Switzerland"         
##  [55] "Ghana"                "Saudi Arabia"         "Armenia"             
##  [58] "Jordan"               "Mongolia"             "Namibia"             
##  [61] "Finland"              "Lebanon"              "Qatar"               
##  [64] "Vietnam"              "Russia"               "Malta"               
##  [67] "Kuwait"               "Czech Republic"       "Bahamas"             
##  [70] "Sri Lanka"            "Cayman Islands"       "Bangladesh"          
##  [73] "United States,"       "Zimbabwe"             "Hungary"             
##  [76] "Latvia"               "Liechtenstein"        "Venezuela"           
##  [79] "Morocco"              "Cambodia"             "Albania"             
##  [82] "Nicaragua"            "Greece"               "Cambodia,"           
##  [85] "Croatia"              "Guatemala"            "West Germany"        
##  [88] "Poland,"              "Slovenia"             "Dominican Republic"  
##  [91] "Nepal"                "Samoa"                "Bermuda"             
##  [94] "Ecuador"              "Georgia"              "Botswana"            
##  [97] "Iraq"                 "Vatican City"         "Angola"              
## [100] "Jamaica"              "Kazakhstan"           "Malawi"              
## [103] "Slovakia"             "Lithuania"            "Afghanistan"         
## [106] "Paraguay"             "Somalia"              "Sudan"               
## [109] "Panama"               "United Kingdom,"      "Uganda"              
## [112] "East Germany"         "Ukraine"              "Montenegro"
length(unique(genre_country[["country"]])) # how many distinct country names
## [1] 114
unique(genre_country[["listed_in"]]) # genres are clean; no stray commas to strip
##  [1] "Dramas"                   "International Movies"    
##  [3] "Horror Movies"            "Action & Adventure"      
##  [5] "Independent Movies"       "Sci-Fi & Fantasy"        
##  [7] "Thrillers"                "Documentaries"           
##  [9] "Sports Movies"            "Comedies"                
## [11] "Romantic Movies"          "Movies"                  
## [13] "Music & Musicals"         "LGBTQ Movies"            
## [15] "Faith & Spirituality"     "Children & Family Movies"
## [17] "Classic Movies"           "Cult Movies"             
## [19] "Stand-Up Comedy"          "Anime Features"
# Delete the commas left in country names (replaced with an empty string)
genre_country <- mutate(
  genre_country,
  country = gsub(",", "", country, fixed = TRUE)
)
genre_country
## # A tibble: 14,342 x 4
##    title country       date_added listed_in           
##    <chr> <chr>         <chr>      <chr>               
##  1 7:19  Mexico        2016       Dramas              
##  2 7:19  Mexico        2016       International Movies
##  3 23:59 Singapore     2018       Horror Movies       
##  4 23:59 Singapore     2018       International Movies
##  5 9     United States 2017       Action & Adventure  
##  6 9     United States 2017       Independent Movies  
##  7 9     United States 2017       Sci-Fi & Fantasy    
##  8 21    United States 2020       Dramas              
##  9 122   Egypt         2020       Horror Movies       
## 10 122   Egypt         2020       International Movies
## # … with 14,332 more rows
unique(genre_country[["country"]]) # the commas are gone from the country names
##   [1] "Mexico"               "Singapore"            "United States"       
##   [4] "Egypt"                "India"                "Thailand"            
##   [7] "Nigeria"              "Norway"               "Iceland"             
##  [10] "United Kingdom"       "South Korea"          "Italy"               
##  [13] "Canada"               "Indonesia"            "Romania"             
##  [16] "Spain"                "Turkey"               "South Africa"        
##  [19] "France"               "Portugal"             "Hong Kong"           
##  [22] "China"                "Germany"              "Argentina"           
##  [25] "Serbia"               "Denmark"              "Poland"              
##  [28] "Japan"                "Kenya"                "New Zealand"         
##  [31] "Pakistan"             "Australia"            "Taiwan"              
##  [34] "Netherlands"          "Philippines"          "United Arab Emirates"
##  [37] "Brazil"               "Iran"                 "Belgium"             
##  [40] "Israel"               "Uruguay"              "Bulgaria"            
##  [43] "Chile"                "Colombia"             "Algeria"             
##  [46] "Soviet Union"         "Sweden"               "Malaysia"            
##  [49] "Ireland"              "Luxembourg"           "Austria"             
##  [52] "Peru"                 "Senegal"              "Switzerland"         
##  [55] "Ghana"                "Saudi Arabia"         "Armenia"             
##  [58] "Jordan"               "Mongolia"             "Namibia"             
##  [61] "Finland"              "Lebanon"              "Qatar"               
##  [64] "Vietnam"              "Russia"               "Malta"               
##  [67] "Kuwait"               "Czech Republic"       "Bahamas"             
##  [70] "Sri Lanka"            "Cayman Islands"       "Bangladesh"          
##  [73] "Zimbabwe"             "Hungary"              "Latvia"              
##  [76] "Liechtenstein"        "Venezuela"            "Morocco"             
##  [79] "Cambodia"             "Albania"              "Nicaragua"           
##  [82] "Greece"               "Croatia"              "Guatemala"           
##  [85] "West Germany"         "Slovenia"             "Dominican Republic"  
##  [88] "Nepal"                "Samoa"                "Bermuda"             
##  [91] "Ecuador"              "Georgia"              "Botswana"            
##  [94] "Iraq"                 "Vatican City"         "Angola"              
##  [97] "Jamaica"              "Kazakhstan"           "Malawi"              
## [100] "Slovakia"             "Lithuania"            "Afghanistan"         
## [103] "Paraguay"             "Somalia"              "Sudan"               
## [106] "Panama"               "Uganda"               "East Germany"        
## [109] "Ukraine"              "Montenegro"
length(unique(genre_country[["country"]])) # distinct country names after cleaning
## [1] 110
# Count movies per country and year for the five selected countries.
# genre_country holds one row per title / genre / country combination, so
# counting rows with n() would count a movie once for every genre it is listed
# in. n_distinct(title) counts each title once per country and year instead
# (title is the only movie identifier carried into genre_country).
movie_year <- genre_country %>%
#  mutate(date_added = as.numeric(date_added)) %>%
  filter(
    country %in% c("United States", "India", "United Kingdom", "Canada", "France")
  ) %>%
  group_by(country, date_added) %>%
  summarise(count = n_distinct(title))
movie_year
## # A tibble: 45 x 3
## # Groups:   country [5]
##    country date_added count
##    <chr>   <chr>      <int>
##  1 Canada  2013           1
##  2 Canada  2014           2
##  3 Canada  2015           5
##  4 Canada  2016          28
##  5 Canada  2017         116
##  6 Canada  2018         125
##  7 Canada  2019         125
##  8 Canada  2020         149
##  9 Canada  2021          10
## 10 France  2011           2
## # … with 35 more rows
#bar <- ggplot(genre_5_country, aes(fill=listed_in, y=count, x=country)) + 
#    geom_bar(position="fill", stat="identity")
#bar

# Line chart of the yearly counts, one coloured line per country.
# The lines are mapped to the colour aesthetic, so the legend title must be set
# through `color`; a `fill` label has no effect because no fill is mapped.
chart <- ggplot(
  movie_year,
  aes(x = date_added, y = count, group = country, color = country)
) +
  geom_line() +
  labs(
    x = "Year",
    y = "Count",
    color = "Country",
    title = "Number of movies by Year in Top 5 Countries"
  )
chart <- ggplotly(chart) # convert the ggplot object to a plotly object
chart